In [73]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import  RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.tree import plot_tree
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
import plotly.io as pio
pio.renderers.default = 'notebook'
In [74]:
df = pd.read_csv('Dataset_for_Food_and_Beverages.csv')
df.head()
Out[74]:
Shop_Id Shop_Name Shop_Location Shop_Type Shop_Website Yearly_Sales Average_Order_Value Foot_Traffic Marketing Rating
0 1 Nepal Cafe Maharajgunj Bistro Yes 5821831 423 96 No 4.3
1 2 Everest Cake Shop Baneshwor Grill Yes 7381237 506 101 No 4.8
2 3 Birat Grill Bhaktapur Restaurant No 9057127 174 251 Yes 5.0
3 4 Birat Grocery Bhaktapur Grocery Yes 4254663 238 80 Yes 3.5
4 5 Chitwan Grocery Lalitpur Grocery No 3122248 60 218 Yes 2.0

Data Cleaning¶

In [75]:
# Sorting
df = df.sort_values("Shop_Name")
df
Out[75]:
Shop_Id Shop_Name Shop_Location Shop_Type Shop_Website Yearly_Sales Average_Order_Value Foot_Traffic Marketing Rating
117 118 Annapurna Bakery Jawalakhel Bakery Yes 3436104 65 197 Yes 2.0
19 20 Annapurna Bistro Jawalakhel Bistro Yes 8025449 506 139 Yes 4.8
169 170 Annapurna Bistro Maharajgunj Bistro Yes 8324338 548 164 No 4.8
99 100 Annapurna Cafe Patan Bistro No 5265481 347 71 Yes 3.0
71 72 Annapurna Cake Shop Patan Bistro No 5761502 417 82 Yes 4.3
... ... ... ... ... ... ... ... ... ... ...
123 124 Pokhara Quick Stop Jawalakhel Bistro Yes 6288847 375 104 No 4.5
155 156 Pokhara Restaurant Bhaktapur Restaurant Yes 8998621 248 270 No 5.0
5 6 Pokhara Restaurant Jawalakhel Restaurant No 4148829 298 100 Yes 3.5
17 18 Pokhara Supermarket Lalitpur Grocery Yes 3725943 286 92 No 3.0
167 168 Pokhara Supermarket Baneshwor Grocery No 6674133 494 127 No 4.8

200 rows × 10 columns

In [76]:
df.isnull().sum()
Out[76]:
Shop_Id                0
Shop_Name              0
Shop_Location          0
Shop_Type              0
Shop_Website           0
Yearly_Sales           0
Average_Order_Value    0
Foot_Traffic           0
Marketing              0
Rating                 0
dtype: int64
In [77]:
# Dropping Unnecessary Columns
df = df.drop("Shop_Id", axis=1)
df.head()
Out[77]:
Shop_Name Shop_Location Shop_Type Shop_Website Yearly_Sales Average_Order_Value Foot_Traffic Marketing Rating
117 Annapurna Bakery Jawalakhel Bakery Yes 3436104 65 197 Yes 2.0
19 Annapurna Bistro Jawalakhel Bistro Yes 8025449 506 139 Yes 4.8
169 Annapurna Bistro Maharajgunj Bistro Yes 8324338 548 164 No 4.8
99 Annapurna Cafe Patan Bistro No 5265481 347 71 Yes 3.0
71 Annapurna Cake Shop Patan Bistro No 5761502 417 82 Yes 4.3
In [78]:
# Index Reset
df= df.reset_index(drop=True)
df
Out[78]:
Shop_Name Shop_Location Shop_Type Shop_Website Yearly_Sales Average_Order_Value Foot_Traffic Marketing Rating
0 Annapurna Bakery Jawalakhel Bakery Yes 3436104 65 197 Yes 2.0
1 Annapurna Bistro Jawalakhel Bistro Yes 8025449 506 139 Yes 4.8
2 Annapurna Bistro Maharajgunj Bistro Yes 8324338 548 164 No 4.8
3 Annapurna Cafe Patan Bistro No 5265481 347 71 Yes 3.0
4 Annapurna Cake Shop Patan Bistro No 5761502 417 82 Yes 4.3
... ... ... ... ... ... ... ... ... ...
195 Pokhara Quick Stop Jawalakhel Bistro Yes 6288847 375 104 No 4.5
196 Pokhara Restaurant Bhaktapur Restaurant Yes 8998621 248 270 No 5.0
197 Pokhara Restaurant Jawalakhel Restaurant No 4148829 298 100 Yes 3.5
198 Pokhara Supermarket Lalitpur Grocery Yes 3725943 286 92 No 3.0
199 Pokhara Supermarket Baneshwor Grocery No 6674133 494 127 No 4.8

200 rows × 9 columns

In [79]:
df['Shop_Type'].unique()
Out[79]:
array(['Bakery', 'Bistro', 'Café', 'Restaurant', 'Convenience Store',
       'Grocery', 'Lounge', 'Supermarket', 'Essentials', 'Grill'],
      dtype=object)
In [80]:
# Shop Type Mapping
df['Shop_Type'] = df['Shop_Type'].replace({
    'Café': 'Cafe',
    'Grocery': 'Grocery',
    'Restaurant': 'Restaurant',
    'Bistro': 'Bistro',
    'Bakery': 'Bakery',
    'Convenience Store': 'Convenience Store',
    'Lounge': 'Lounge'
})
df['Shop_Type'] = df['Shop_Type'].str.title()
In [81]:
# Website Mapping (0,1)
df['Shop_Website'] = df['Shop_Website'].replace({
    'Yes': 1,
    'No': 0
})
df.head()
Out[81]:
Shop_Name Shop_Location Shop_Type Shop_Website Yearly_Sales Average_Order_Value Foot_Traffic Marketing Rating
0 Annapurna Bakery Jawalakhel Bakery 1 3436104 65 197 Yes 2.0
1 Annapurna Bistro Jawalakhel Bistro 1 8025449 506 139 Yes 4.8
2 Annapurna Bistro Maharajgunj Bistro 1 8324338 548 164 No 4.8
3 Annapurna Cafe Patan Bistro 0 5265481 347 71 Yes 3.0
4 Annapurna Cake Shop Patan Bistro 0 5761502 417 82 Yes 4.3
In [82]:
# Marketing Column Mapping
marketing_map = {
    'Yes': 1,
    'No': 0,
}
df['Marketing'] = df['Marketing'].map(marketing_map)
df.head()
Out[82]:
Shop_Name Shop_Location Shop_Type Shop_Website Yearly_Sales Average_Order_Value Foot_Traffic Marketing Rating
0 Annapurna Bakery Jawalakhel Bakery 1 3436104 65 197 1 2.0
1 Annapurna Bistro Jawalakhel Bistro 1 8025449 506 139 1 4.8
2 Annapurna Bistro Maharajgunj Bistro 1 8324338 548 164 0 4.8
3 Annapurna Cafe Patan Bistro 0 5265481 347 71 1 3.0
4 Annapurna Cake Shop Patan Bistro 0 5761502 417 82 1 4.3
In [83]:
# Ratings Category (Low, Medium, High)
def categorize_rating(rating):
    if rating <= 2.5:
        return 'Low'
    elif rating <= 4.0:
        return 'Medium'
    else:
        return 'High'

df['Rating_Category'] = df['Rating'].apply(categorize_rating)
df.head()
Out[83]:
Shop_Name Shop_Location Shop_Type Shop_Website Yearly_Sales Average_Order_Value Foot_Traffic Marketing Rating Rating_Category
0 Annapurna Bakery Jawalakhel Bakery 1 3436104 65 197 1 2.0 Low
1 Annapurna Bistro Jawalakhel Bistro 1 8025449 506 139 1 4.8 High
2 Annapurna Bistro Maharajgunj Bistro 1 8324338 548 164 0 4.8 High
3 Annapurna Cafe Patan Bistro 0 5265481 347 71 1 3.0 Medium
4 Annapurna Cake Shop Patan Bistro 0 5761502 417 82 1 4.3 High

Visualizations¶

In [84]:
# count the each shop type
shop_type_counts = df['Shop_Type'].value_counts().reset_index()
shop_type_counts.columns = ['Shop_Type', 'Count']

# pie
fig = px.pie(shop_type_counts, names='Shop_Type', values='Count', title='Shop Type Distribution')
fig.update_layout(
    template='plotly_white',
    width=800,
    height=600
)
fig.show()
In [85]:
# visualization the shop type with most foot traffic
shop_type_foot_traffic = df.groupby('Shop_Type')['Foot_Traffic'].mean().reset_index()
shop_type_foot_traffic = shop_type_foot_traffic.sort_values(by='Foot_Traffic', ascending=False)
#histogram
fig = px.histogram(shop_type_foot_traffic, x='Shop_Type', y='Foot_Traffic', title='Shop Type vs Foot Traffic',color='Shop_Type')
fig.update_layout(
    xaxis_title='Shop Type',
    yaxis_title='Foot Traffic',
    template='plotly_white',
    width=800,
    height=600
)
fig.show()
In [86]:
#visualization based on ratings and Shop Type
shop_type_rating = df.groupby('Shop_Type')['Rating'].mean().reset_index()
shop_type_rating = shop_type_rating.sort_values(by='Rating', ascending=False)

#line
fig = px.line(shop_type_rating, x='Shop_Type', y='Rating', title='Shop Type vs Rating')
fig.update_layout(
    xaxis_title='Shop Type',
    yaxis_title='Rating',
    template='plotly_white',
    width=800,
    height=600
)
fig.update_xaxes(tickangle=45)
fig.show()
In [87]:
#visualization of marketing and yearly sales
marketing_sales = df.groupby('Marketing')['Yearly_Sales'].mean().reset_index()
marketing_sales = marketing_sales.sort_values(by='Yearly_Sales', ascending=False)

#bar
fig = px.bar(marketing_sales, x='Marketing', y='Yearly_Sales', title='Yearly Sales by Marketing', color='Yearly_Sales')
fig.update_layout(
    xaxis_title='Marketing',
    yaxis_title='Yearly Sales',
    template='plotly_white',
    width=800,
    height=600
)
fig.show()
In [88]:
# Visualization of Shop Type and Yearly Sales 
shop_type_sales = df.groupby('Shop_Type')['Yearly_Sales'].mean().reset_index()
shop_type_sales = shop_type_sales.sort_values(by='Yearly_Sales', ascending=False)

# bar
fig = px.bar(shop_type_sales, x='Shop_Type', y='Yearly_Sales', title='Yearly Sales by Shop Type', color='Yearly_Sales', color_continuous_scale='magma')
fig.update_layout(
    xaxis_title='Shop Type',
    yaxis_title='Yearly Sales',
    template='plotly_white',
    width=800,
    height=600
)
fig.update_xaxes(tickangle=45)
fig.show()
In [89]:
# Average order value by shop website
shop_website_avg_order = df.groupby('Shop_Website')['Average_Order_Value'].mean().reset_index()
shop_website_avg_order = shop_website_avg_order.sort_values(by='Average_Order_Value', ascending=False)

# Bar
fig = px.bar(shop_website_avg_order, x='Shop_Website', y='Average_Order_Value', title='Average Order Value by Shop Website', color='Average_Order_Value', color_continuous_scale='temps')
fig.update_layout(
    xaxis_title='Shop Website',
    yaxis_title='Average Order Value',
    template='plotly_white',
    width=800,
    height=600
)
fig.update_xaxes(tickangle=45)
fig.show()
In [90]:
# Shop location and foot traffic
shop_location_foot_traffic = df.groupby('Shop_Location')['Foot_Traffic'].mean().reset_index()
shop_location_foot_traffic.columns = ['Shop_Location', 'Average_Foot_Traffic']

# Scatter
fig = px.scatter(shop_location_foot_traffic, x='Shop_Location', y='Average_Foot_Traffic', title='Shop Location by Foot Traffic', color='Average_Foot_Traffic')
fig.update_layout(
    xaxis_title='Shop Location',
    yaxis_title='Average Foot Traffic',
    template='plotly_white',
    width=800,
    height=600
)
fig.show()
In [91]:
# visualization of shop type and average order value
shop_type_avg_order = df.groupby('Shop_Type')['Average_Order_Value'].mean().reset_index()
shop_type_avg_order = shop_type_avg_order.sort_values(by='Average_Order_Value', ascending=False)

# Bar
fig = px.bar(shop_type_avg_order, x='Shop_Type', y='Average_Order_Value', title='Average Order Value by Shop Type', color='Average_Order_Value', color_continuous_scale='viridis')
fig.update_layout(
    xaxis_title='Shop Type',
    yaxis_title='Average Order Value',
    template='plotly_white',
    width=800,
    height=600
)
fig.update_xaxes(tickangle=45)
fig.show()
In [92]:
#average sales by shop location
shop_location_sales = df.groupby('Shop_Location')['Yearly_Sales'].mean().reset_index()
shop_location_sales = shop_location_sales.sort_values(by='Yearly_Sales', ascending=False)

# hist
fig = px.histogram(shop_location_sales, x='Shop_Location', y='Yearly_Sales', title='Shop Location vs Yearly Sales', color='Yearly_Sales')
fig.update_layout(
    xaxis_title='Shop Location',
    yaxis_title='Yearly Sales',
    template='plotly_white',
    width=800,
    height=600
)
fig.show()
In [93]:
# Average Order Value by Shop Location
shop_location_avg_order = df.groupby('Shop_Location')['Average_Order_Value'].mean().reset_index()
fig = px.pie(shop_location_avg_order, names='Shop_Location', values='Average_Order_Value', title='Average Order Value by Shop Location')
fig.update_layout(template='plotly_white', width=800, height=600)
fig.show()

Model Training¶

In [94]:
df.columns
Out[94]:
Index(['Shop_Name', 'Shop_Location', 'Shop_Type', 'Shop_Website',
       'Yearly_Sales', 'Average_Order_Value', 'Foot_Traffic', 'Marketing',
       'Rating', 'Rating_Category'],
      dtype='object')
In [95]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Shop_Name            200 non-null    object 
 1   Shop_Location        200 non-null    object 
 2   Shop_Type            200 non-null    object 
 3   Shop_Website         200 non-null    int64  
 4   Yearly_Sales         200 non-null    int64  
 5   Average_Order_Value  200 non-null    int64  
 6   Foot_Traffic         200 non-null    int64  
 7   Marketing            200 non-null    int64  
 8   Rating               200 non-null    float64
 9   Rating_Category      200 non-null    object 
dtypes: float64(1), int64(5), object(4)
memory usage: 15.8+ KB
In [96]:
df.describe()
Out[96]:
Shop_Website Yearly_Sales Average_Order_Value Foot_Traffic Marketing Rating
count 200.000000 2.000000e+02 200.000000 200.000000 200.00000 200.000000
mean 0.555000 6.160280e+06 320.485000 148.615000 0.50500 3.930000
std 0.498213 2.129283e+06 156.265107 63.782662 0.50123 1.076286
min 0.000000 2.530677e+06 55.000000 60.000000 0.00000 2.000000
25% 0.000000 4.273359e+06 200.750000 96.000000 0.00000 3.000000
50% 1.000000 6.309304e+06 289.500000 131.000000 1.00000 4.300000
75% 1.000000 8.104099e+06 473.250000 207.000000 1.00000 4.800000
max 1.000000 9.489331e+06 599.000000 270.000000 1.00000 5.000000
In [97]:
corr = df.corr(numeric_only=1)
corr
Out[97]:
Shop_Website Yearly_Sales Average_Order_Value Foot_Traffic Marketing Rating
Shop_Website 1.000000 0.138697 0.082630 0.014507 0.059263 0.148724
Yearly_Sales 0.138697 1.000000 0.440359 0.407206 0.065382 0.935705
Average_Order_Value 0.082630 0.440359 1.000000 -0.468402 0.003594 0.593349
Foot_Traffic 0.014507 0.407206 -0.468402 1.000000 0.042893 0.193354
Marketing 0.059263 0.065382 0.003594 0.042893 1.000000 0.056542
Rating 0.148724 0.935705 0.593349 0.193354 0.056542 1.000000
In [98]:
# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='magma', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
No description has been provided for this image

Random forest Regressor¶

In [99]:
features_reg = df[
    [
        "Shop_Website",
        "Marketing",
        "Rating",
        "Average_Order_Value",
        "Foot_Traffic"
    ]
]
target_reg = df["Yearly_Sales"]

# Split the data into training and testing sets
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(features_reg, target_reg, test_size=0.2, random_state=42)

regressor = RandomForestRegressor()
regressor.fit(X_train_reg, y_train_reg)

y_pred_rge = regressor.predict(X_test_reg)
mae = mean_absolute_error(y_test_reg, y_pred_rge)
r2 = r2_score(y_test_reg, y_pred_rge)
print(f"Mean Absolute Error: {mae}")
print(f"R2 Score: {r2}")
Mean Absolute Error: 333324.6024999999
R2 Score: 0.9515501455559853
In [100]:
# Visualization of predicted vs actual costs
fig_cost = px.scatter(x=y_test_reg, y=y_pred_rge, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y_test_reg.min(), y0=y_test_reg.min(),
    x1=y_test_reg.max(), y1=y_test_reg.max()
)
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()

cost_list = [[predicted] for actual, predicted in zip(y_test_reg, y_pred_rge)]
In [101]:
#plot decision tree of the random forest regressor
plt.figure(figsize=(20, 10))
plot_tree(regressor.estimators_[0], filled=True, feature_names=features_reg.columns)
plt.show()
No description has been provided for this image

Random Forest Classifier¶

In [102]:
from sklearn.metrics import accuracy_score, classification_report


features_clf = df[
    [
        'Shop_Website',
        'Yearly_Sales',
        'Average_Order_Value',
        'Foot_Traffic',
        'Marketing'
    ]
]
target_clf = df['Rating_Category']

X_train, X_test, y_train, y_test = train_test_split(features_clf, target_clf, test_size=0.2, random_state=42)

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
In [103]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Low', 'Medium', 'High'])

print("Accuracy:", accuracy)
print("Classification Report:\n", report)
Accuracy: 0.95
Classification Report:
               precision    recall  f1-score   support

         Low       0.92      1.00      0.96        23
      Medium       1.00      1.00      1.00         2
        High       1.00      0.87      0.93        15

    accuracy                           0.95        40
   macro avg       0.97      0.96      0.96        40
weighted avg       0.95      0.95      0.95        40

In [104]:
import plotly.figure_factory as ff
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
class_names = ['Low', 'Medium', 'High']
fig_cm = ff.create_annotated_heatmap(
    z=cm, 
    x=[f'Predicted {label}' for label in class_names], 
    y=[f'Actual {label}' for label in class_names], 
    colorscale='viridis'
)
fig_cm.update_layout(
    title='Confusion Matrix',
    xaxis=dict(title='Predicted Label'),
    yaxis=dict(title='Actual Label'),
    paper_bgcolor="white"
)
fig_cm.show()

cm
Out[104]:
array([[23,  0,  0],
       [ 0,  2,  0],
       [ 2,  0, 13]])

Hyperparameter tuning and cross validation¶

In [105]:
#hyperparameter tuning and cross validation for RandomForestRegressor
rfc = RandomForestRegressor(
    random_state=42, 
    n_estimators=300,         
    max_depth=3,           
    min_samples_split=2,      
    min_samples_leaf=2,       
)

rfc.fit(X_train_reg, y_train_reg)

y_pred_reg = rfc.predict(X_test_reg)

cv_scores_r2 = cross_val_score(rfc, features_reg, target_reg, cv=5, scoring='r2')
cv_scores_mae = cross_val_score(rfc, features_reg, target_reg, cv=5, scoring='neg_mean_absolute_error')
cv_scores_mae = -cv_scores_mae 
print("Mean R2 score:", cv_scores_r2.mean())
print("Mean MAE score:", cv_scores_mae.mean())
Mean R2 score: 0.9632741545892657
Mean MAE score: 326892.12028971047
In [106]:
# actual and predicted values after hyperparameter tuning and cross validation
fig_cost = px.scatter(x=y_test_reg, y=y_pred_reg, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y_test_reg.min(), y0=y_test_reg.min(),
    x1=y_test_reg.max(), y1=y_test_reg.max()
)
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()
In [107]:
#hyperparameter tuning and cross validation for RandomForestClassifier
rfc = RandomForestClassifier(
    random_state=42, 
    n_estimators=300,         
    max_depth=3,           
    min_samples_split=2,      
    min_samples_leaf=2,       
)

rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)

cv_scores_rfc = cross_val_score(rfc, features_clf, target_clf, cv=5, scoring='accuracy')
print("Cross-validation accuracy scores for Random Forest Classifier:", cv_scores_rfc)
print("Mean accuracy score:", cv_scores_rfc.mean())
Cross-validation accuracy scores for Random Forest Classifier: [0.95 0.9  1.   1.   1.  ]
Mean accuracy score: 0.97
In [108]:
# df.to_csv('cleaned_food_and_beverages.csv')

#to view the actual and predicted values
# for actual, predicted in zip(y_test_reg, y_pred_gbr):
#     print(f"Actual: {actual}, Predicted: {int(predicted)}")